#!/usr/bin/env perl
#
# A first-stage fasta filter for RepeatScout libraries: removes anything under 50 
# nt.  Removes anything that is over 50% low-complexity vis a vis TRF or NSEG.
#
# Status is printed to STDERR, the resulting "good" repeats to STDOUT.
#
#
# Created: 20050416
# Author: Neil Jones
#-----------------------------------------------------------------------------------

use File::Temp;

if( grep /^--?h$/, @ARGV ) {
	exec("perldoc $0");
}

#
# Same parameters as Alkes' earlier version
#
my $NSEG_THRESHOLD = .5;
my $TRF_THRESHOLD  = .5;
my $MIN_LENGTH     = 50;

my $TRF_COMMAND    = $ENV{'TRF_COMMAND'}  || "trf";
my $NSEG_COMMAND   = $ENV{'NSEG_COMMAND'} || "nseg";


my ($head, $body);
my $repeat_num  = 1;
my $num_deleted = 0;
my $num_skipped = 0;
my $line        = 0;
while(<>) {
	++$line;
	chomp;
	next if /^#/;  # Allow embedded comments.

	if( /^>/ || eof() ) {
                $body .= $_ if eof();
		my $xx = $_;
		my $nseg;
		my $trf;
		if( length($body) > $MIN_LENGTH) {
			my $tmp = new File::Temp( DIR => '.');
			print $tmp "$head (RR=$repeat_num)\n";
			print $tmp "$body\n";
			$nseg = nseg($tmp, $body);
			$trf  = trf($tmp, $body);
			$tmp->close();
			$tmp = undef;  # Remove any reference counts so it's deleted.

			if( $nseg <= $NSEG_THRESHOLD && $trf <= $TRF_THRESHOLD ) 
			{
				printf("$head (RR=$repeat_num.  TRF=%.3f NSEG=%.3f)\n", $trf, $nseg);
				for(my $ii = 0; $ii < length($body); $ii += 80) {
					print substr($body, $ii, 80), "\n";
				}
			}

			else {
				print STDERR "deleting $header: $nseg / $trf\n";
				++$num_deleted;
			}

			++$repeat_num;
		}

		else {
			++$num_skipped;
		}
		$head = $xx;
		$body = '';
	}

	else {
		die("Improperly formatted file: offensive line #$line was\n$_\n") if /[^ATGC]/;
		
		$body .= $_;
	}
}
print STDERR "$num_deleted deleted.  $repeat_num saved. $num_skipped skipped for length.\n";

sub nseg {
	my ($file, $text) = @_;
	local *NSEG;
	my $nseg = $NSEG_COMMAND;
	open(NSEG, "$nseg $file -x -c 100 |");
	my $len = 0;
	while(<NSEG>) {
		chomp;
		next if /^>/;

		s/n//g;
		s/N//g;
		
		$len += 1*length($_);
	}
	close(NSEG);
	return (length($text)-$len) / length($text);
}

sub trf {
	my ($file, $text) = @_;
	local *TRF;
	my $trf = $TRF_COMMAND;

	system("$trf $file 2 7 7 80 10 50 500 -f -d -m > /dev/null");
	open(TRF, "<$file.2.7.7.80.10.50.500.mask") or die($!);
	my $len = 0;

	while(<TRF>) {
		chomp;
		next if /^>/;
		s/n//g;
		s/N//g;
		$len += 1*length($_);
	}
	close(TRF);

	# TRF produces many output files.
	unlink "$file.2.7.7.80.10.50.500.mask";
	unlink "$file.2.7.7.80.10.50.500.dat";
	unlink "$file.2.7.7.80.10.50.500.1.txt.html";
	unlink "$file.2.7.7.80.10.50.500.1.html";

	return (length($text)-$len) / length($text);
}

__END__

=head1 NAME  

filter-stage-1.prl -- a first stage post-processing tool for RepeatScout output.

=head1 SYNOPSIS

cat repeats.fa | filter-stage-1.prl > repeats-filtered.prl

=head1 OPTIONS

none other than "-h" (the output of which you're reading), 
but you will either want trf and nseg in your PATH, or you will want
to set the environment variables TRF_COMMAND and NSEG_COMMAND to provide
the executable.

=head1 DESCRIPTION

This tool takes a repeat library, which is a Fasta-formatted sequence file, and
filters out any sequence that is deemed to be more than 50% low-complexity by either
TRF or NSEG or both.  Note that one algorithm needs to make the determination; we don't
check the total number of unique bases masked out by TRF and NSEG individually.

=head1  ENVIRONMENT VARIABLES

In order for this program to find TRF and NSEG, you need to either place said programs
in your PATH, or you need to add the environment variables TRF_COMMAND and NSEG_COMMAND.
The value of those variables should be the path at which the respective program can be
found.

=cut
